In [1]:
import json
import requests
import csv
import pandas as pd
import time
import seaborn as sns
import numpy as np
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from textblob import TextBlob
import nltk
from nltk import word_tokenize
nltk.download('punkt')
from nltk.probability import FreqDist
import re
#nltk.download('stopwords')
#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import operator #module defines functions that correspond to the concept of getters. 


import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.style.use('ggplot')

import plotly
import plotly.express as px
import plotly.offline as pyo              #with this I will be able to plot charts offline
import plotly.graph_objects as go            #using plotly graph objects

from IPython.display import display
plotly.offline.init_notebook_mode()

import warnings
warnings.filterwarnings('ignore')
[nltk_data] Downloading package punkt to /Users/ujjwaloli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
In [2]:
df1 =  pd.read_csv('data1_file.csv')
df2 = pd.read_csv('data2_file.csv')
n_data = pd.read_csv("netflix_titles.csv")
print(df1.shape)
print(df2.shape)
print(n_data.shape)
(3200, 13)
(3190, 13)
(6234, 12)
Out[2]:
show_id type title director cast country date_added release_year rating duration listed_in description
0 81145628 Movie Norm of the North: King Sized Adventure Richard Finn, Tim Maltby Alan Marriott, Andrew Toth, Brian Dobson, Cole... United States, India, South Korea, China September 9, 2019 2019 TV-PG 90 min Children & Family Movies, Comedies Before planning an awesome wedding for his gra...
1 80117401 Movie Jandino: Whatever it Takes NaN Jandino Asporaat United Kingdom September 9, 2016 2016 TV-MA 94 min Stand-Up Comedy Jandino Asporaat riffs on the challenges of ra...
2 70234439 TV Show Transformers Prime NaN Peter Cullen, Sumalee Montano, Frank Welker, J... United States September 8, 2018 2013 TV-Y7-FV 1 Season Kids' TV With the help of three human allies, the Autob...
3 80058654 TV Show Transformers: Robots in Disguise NaN Will Friedle, Darren Criss, Constance Zimmer, ... United States September 8, 2018 2016 TV-Y7 1 Season Kids' TV When a prison ship crash unleashes hundreds of...
4 80125979 Movie #realityhigh Fernando Lebrija Nesta Cooper, Kate Walsh, John Michael Higgins... United States September 8, 2017 2017 TV-14 99 min Comedies When nerdy high schooler Dani finally attracts...
... ... ... ... ... ... ... ... ... ... ... ... ...
6229 80000063 TV Show Red vs. Blue NaN Burnie Burns, Jason Saldaña, Gustavo Sorola, G... United States NaN 2015 NR 13 Seasons TV Action & Adventure, TV Comedies, TV Sci-Fi ... This parody of first-person shooter games, mil...
6230 70286564 TV Show Maron NaN Marc Maron, Judd Hirsch, Josh Brener, Nora Zeh... United States NaN 2016 TV-MA 4 Seasons TV Comedies Marc Maron stars as Marc Maron, who interviews...
6231 80116008 Movie Little Baby Bum: Nursery Rhyme Friends NaN NaN NaN NaN 2016 NaN 60 min Movies Nursery rhymes and original music for children...
6232 70281022 TV Show A Young Doctor's Notebook and Other Stories NaN Daniel Radcliffe, Jon Hamm, Adam Godley, Chris... United Kingdom NaN 2013 TV-MA 2 Seasons British TV Shows, TV Comedies, TV Dramas Set during the Russian Revolution, this comic ...
6233 70153404 TV Show Friends NaN Jennifer Aniston, Courteney Cox, Lisa Kudrow, ... United States NaN 2003 TV-14 10 Seasons Classic & Cult TV, TV Comedies This hit sitcom follows the merry misadventure...

6234 rows × 12 columns

In [3]:
#merging two dataframes
netflix_data = df1.append(df2)

netflix_data['country']=netflix_data['country'].fillna('usa')
netflix_data.isnull().sum()

#checking the data column values to see if we have any null values

print(n_data.isnull().sum())
print(n_data.shape)
show_id            0
type               0
title              0
director        1969
cast             570
country          476
date_added        11
release_year       0
rating            10
duration           0
listed_in          0
description        0
dtype: int64
(6234, 12)
In [4]:
#dropping some unwanted columns and renaming column name
netflix_dat = netflix_data.drop(columns=['image','largeimage','download','imdbid'], axis= 1)
netflix_dat.rename(columns={'synopsis':'description','released':'release_year'}, inplace =True)
netflix_dat
n_data.rename(columns={'show_id':'netflixid'}, inplace =True)
n_data
print()
print(netflix_dat.shape)
print(n_data.shape)
(6390, 9)
(6234, 12)
In [5]:
#check for duplicates

n_data_filter =n_data.drop_duplicates(subset=['netflixid','title'])
netflix_data_filter =netflix_dat.drop_duplicates(subset=['netflixid','title'])

print(netflix_data_filter.duplicated().any()) #again checking if any duplicates left
print(n_data_filter.duplicated().any()) # checking if any duplicates left for another dataframe

print(n_data_filter.shape)
print(netflix_data_filter.shape)
False
False
(6234, 12)
(764, 9)

My dataframes are:

In [6]:
n_data_filter.head(2)
netflix_data_filter.head(2)
Out[6]:
netflixid type title director cast country date_added release_year rating duration listed_in description
0 81145628 Movie Norm of the North: King Sized Adventure Richard Finn, Tim Maltby Alan Marriott, Andrew Toth, Brian Dobson, Cole... United States, India, South Korea, China September 9, 2019 2019 TV-PG 90 min Children & Family Movies, Comedies Before planning an awesome wedding for his gra...
1 80117401 Movie Jandino: Whatever it Takes NaN Jandino Asporaat United Kingdom September 9, 2016 2016 TV-MA 94 min Stand-Up Comedy Jandino Asporaat riffs on the challenges of ra...
In [7]:
## extracting dates from the date columns and making ne columns
n_data_filter["date_added"] = pd.to_datetime(n_data_filter['date_added'])
n_data_filter['added_year'] = n_data_filter['date_added'].dt.year
n_data_filter['added_month'] = n_data_filter['date_added'].dt.month
n_data_filter

#netflix_dat["unogsdate"] = pd.to_datetime(netflix_dat['unogsdate'])  #this gave me error so I had to look into the the column
#netflix_dat["date_filter"] = netflix_dat["unogsdate"].apply(lambda x: len(x))
#netflix_dat_extra = netflix_dat[netflix_dat["date_filter"] != 10]
#netflix_dat = netflix_dat.drop(3200)

netflix_data_filter["unogsdate"] = pd.to_datetime(netflix_data_filter['unogsdate']) 
netflix_data_filter['added_year'] = netflix_data_filter['unogsdate'].dt.year
netflix_data_filter['added_month'] = netflix_data_filter['unogsdate'].dt.month
netflix_data_filter
Out[7]:
netflixid title description rating type release_year runtime unogsdate country added_year added_month
0 70283264 The 100 A century after Earth was devastated by a nucl... 7.7 series 2014 NaN 2015-04-14 usa 2015 4
1 80004534 Chelsea Peretti: One of the Greats 'Brooklyn Nine-Nine' star Chelsea Pere... 7.0 movie 2014 1h14m 2015-04-14 usa 2015 4
2 80023794 Marvel and ESPN Films Present: 1 of 1: Genesis This documentary explores the connections betw... 3.8 movie 2014 1h16m 2015-04-14 usa 2015 4
3 70302184 From One Second to the Next Werner Herzog chronicles the devastating conse... 7.4 movie 2013 34m 2015-04-14 usa 2015 4
4 80011852 The Other One: The Long Strange Trip of Bob Weir This chronicle of Bob Weir highlights his brot... 7.4 movie 2015 1h23m 2015-04-14 usa 2015 4
... ... ... ... ... ... ... ... ... ... ... ...
3175 70157434 Extras Grasping hungrily for stardom, bit actor Andy ... 8.3 series 2005 NaN 2015-04-15 gb 2015 4
3176 70175670 Episodes Hoping to create an American version of their ... 7.8 series 2011 NaN 2015-04-15 gb 2015 4
3178 70269516 Parade's End This World War I-era miniseries chronicles the... 7.6 series 2012 NaN 2015-04-15 gb 2015 4
3182 70207411 Fawlty Towers Haughty Basil Fawlty and his bossy wife try to... 8.7 series 1975 NaN 2015-04-15 gb 2015 4
3191 80039379 Top Gear: The Perfect Road Trip The 'Top Gear' team motors from Italy ... 7.9 series 2013 NaN 2015-04-15 gb 2015 4

764 rows × 11 columns

In [8]:
#selecting specific columns 
rating_view = netflix_data_filter.loc[:,['netflixid','rating']]
rating_view.rename(columns={'rating':'rating_num'}, inplace =True)
#print(rating_view.shape)

#merging data frame 
merge_data = pd.merge(n_data_filter, rating_view, on='netflixid', how='inner')

#print(merge_data.shape)  #checking the rows and columns of the df

#checking for null values in each columns
merge_data.rating_num.isnull().sum()
merge_data
merge_data.isnull().sum()
Out[8]:
netflixid         0
type              0
title             0
director        191
cast             40
country          17
date_added        1
release_year      0
rating            5
duration          0
listed_in         0
description       0
added_year        1
added_month       1
rating_num       38
dtype: int64
In [11]:
merged_list = merge_data.netflixid.to_list()  #making a list of netflixid
#print(len(merged_list))#530
rating_data_list = rating_view.netflixid.to_list() #making a list of netflixid from another dataframe
#print(len(rating_data_list))#764

data_to_add =[]  #making empty list

for i in rating_data_list:    #using for loop to to get the netflixid that is not in the merged dataframe
    if i not in merged_list:
        data_to_add.append(i)   #appending list with the missed netflixid

#data_to_add  #prints list of netflixid
In [12]:
#setting index 
netflix_data_filter_set= netflix_data_filter.set_index("netflixid")
netflix_data_filter_set
Out[12]:
title description rating type release_year runtime unogsdate country added_year added_month
netflixid
70283264 The 100 A century after Earth was devastated by a nucl... 7.7 series 2014 NaN 2015-04-14 usa 2015 4
80004534 Chelsea Peretti: One of the Greats 'Brooklyn Nine-Nine' star Chelsea Pere... 7.0 movie 2014 1h14m 2015-04-14 usa 2015 4
80023794 Marvel and ESPN Films Present: 1 of 1: Genesis This documentary explores the connections betw... 3.8 movie 2014 1h16m 2015-04-14 usa 2015 4
70302184 From One Second to the Next Werner Herzog chronicles the devastating conse... 7.4 movie 2013 34m 2015-04-14 usa 2015 4
80011852 The Other One: The Long Strange Trip of Bob Weir This chronicle of Bob Weir highlights his brot... 7.4 movie 2015 1h23m 2015-04-14 usa 2015 4
... ... ... ... ... ... ... ... ... ... ...
70157434 Extras Grasping hungrily for stardom, bit actor Andy ... 8.3 series 2005 NaN 2015-04-15 gb 2015 4
70175670 Episodes Hoping to create an American version of their ... 7.8 series 2011 NaN 2015-04-15 gb 2015 4
70269516 Parade's End This World War I-era miniseries chronicles the... 7.6 series 2012 NaN 2015-04-15 gb 2015 4
70207411 Fawlty Towers Haughty Basil Fawlty and his bossy wife try to... 8.7 series 1975 NaN 2015-04-15 gb 2015 4
80039379 Top Gear: The Perfect Road Trip The 'Top Gear' team motors from Italy ... 7.9 series 2013 NaN 2015-04-15 gb 2015 4

764 rows × 10 columns

In [13]:
#making a new df and resetting index
data_add_df = netflix_data_filter_set.loc[data_to_add]
data_add_df_set = data_add_df.reset_index()
data_add_df_set.head(1)
Out[13]:
netflixid title description rating type release_year runtime unogsdate country added_year added_month
0 80004447 Horsin' Around Three little orphans - one, two, three. Withou... 9.5 movie 1987 48 2015-04-14 usa 2015 4
In [14]:
#renaming columns 
add_df_rename =data_add_df_set.rename(columns={'rating':'rating_num',
                                              'runtime':'duration'})

#merging two dataframes to include all netflixid
rating_data_overall = merge_data.append(add_df_rename, ignore_index = True) 
rating_data_overall.head(2)
Out[14]:
netflixid type title director cast country date_added release_year rating duration listed_in description added_year added_month rating_num unogsdate
0 70234439 TV Show Transformers Prime NaN Peter Cullen, Sumalee Montano, Frank Welker, J... United States 2018-09-08 2013 TV-Y7-FV 1 Season Kids' TV With the help of three human allies, the Autob... 2018.0 9.0 7.8 NaT
1 80045922 Movie 6 Years Hannah Fidell Taissa Farmiga, Ben Rosenfield, Lindsay Burdge... United States 2015-09-08 2015 NR 80 min Dramas, Independent Movies, Romantic Movies As a volatile young couple who have been toget... 2015.0 9.0 5.6 NaT

Finding count of high-rated movie and low-rated movie

In [15]:
copy_count_rated = rating_data_overall.copy()   #making copy of the dataframe
copy_count_rated.rating_num = copy_count_rated.rating_num.fillna(0)  #filling null values with zero for easy processing
#copy_count_rated.rating_num.value_counts()           #counts value for specific columns 
copy_count_rated['rating_standard'] = 'N/A'  #making a new column in dataframe
copy_count_rated
Out[15]:
netflixid type title director cast country date_added release_year rating duration listed_in description added_year added_month rating_num unogsdate rating_standard
0 70234439 TV Show Transformers Prime NaN Peter Cullen, Sumalee Montano, Frank Welker, J... United States 2018-09-08 2013 TV-Y7-FV 1 Season Kids' TV With the help of three human allies, the Autob... 2018.0 9.0 7.8 NaT N/A
1 80045922 Movie 6 Years Hannah Fidell Taissa Farmiga, Ben Rosenfield, Lindsay Burdge... United States 2015-09-08 2015 NR 80 min Dramas, Independent Movies, Romantic Movies As a volatile young couple who have been toget... 2015.0 9.0 5.6 NaT N/A
2 80182115 Movie Long Shot Jacob LaMendola NaN United States 2017-09-29 2017 TV-14 40 min Documentaries When Juan Catalan is arrested for a murder he ... 2017.0 9.0 7.4 NaT N/A
3 80157072 Movie Hold the Dark Jeremy Saulnier Jeffrey Wright, Alexander Skarsgård, James Bad... United States 2018-09-28 2018 TV-MA 126 min Action & Adventure, Dramas In the grim Alaskan winter, a naturalist hunts... 2018.0 9.0 5.6 NaT N/A
4 81001809 Movie Lessons from a School Shooting: Notes from Dun... Kim A. Snyder NaN United States 2018-09-28 2018 TV-PG 24 min Documentaries Two priests – one from Dunblane, Scotland, the... 2018.0 9.0 5.8 NaT N/A
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
759 70272727 series Death in Paradise NaN NaN gb NaT 2011 NaN NaN NaN Scotland Yard Detective Inspector Richard Pool... 2015.0 4.0 7.8 2015-04-15 N/A
760 70227904 movie Walking with Dinosaurs: Land of the Giants NaN NaN gb NaT 2002 NaN 29m NaN Nigel Marven travels to a time when monstrous ... 2015.0 4.0 7.5 2015-04-15 N/A
761 70269516 series Parade's End NaN NaN gb NaT 2012 NaN NaN NaN This World War I-era miniseries chronicles the... 2015.0 4.0 7.6 2015-04-15 N/A
762 70207411 series Fawlty Towers NaN NaN gb NaT 1975 NaN NaN NaN Haughty Basil Fawlty and his bossy wife try to... 2015.0 4.0 8.7 2015-04-15 N/A
763 80039379 series Top Gear: The Perfect Road Trip NaN NaN gb NaT 2013 NaN NaN NaN The 'Top Gear' team motors from Italy ... 2015.0 4.0 7.9 2015-04-15 N/A

764 rows × 17 columns

In [16]:
# my comparison standard for high-rating is 7.0 so any movie/show that is equal or more than 7.0 is high-rated and 
#any movie/show that is less than 7.0 is low-rated.

#using for loop to make a new column and specify the movie/show as high-rated or low-rated
for each in range(len(copy_count_rated.rating_num)):
    if copy_count_rated.rating_num[each] >=7.0:
        copy_count_rated['rating_standard'][each] = 'High-rated'
    elif copy_count_rated.rating_num[each] <7.0: #and copy_count_rated.rating_num[each] >=1:
        copy_count_rated['rating_standard'][each] = 'Low-rated'
        
print(copy_count_rated.rating_standard.value_counts())   #printing total number of counts of high-rated and low-rated movie
copy_count_rated.head(2)
Low-rated     401
High-rated    363
Name: rating_standard, dtype: int64
Out[16]:
netflixid type title director cast country date_added release_year rating duration listed_in description added_year added_month rating_num unogsdate rating_standard
0 70234439 TV Show Transformers Prime NaN Peter Cullen, Sumalee Montano, Frank Welker, J... United States 2018-09-08 2013 TV-Y7-FV 1 Season Kids' TV With the help of three human allies, the Autob... 2018.0 9.0 7.8 NaT High-rated
1 80045922 Movie 6 Years Hannah Fidell Taissa Farmiga, Ben Rosenfield, Lindsay Burdge... United States 2015-09-08 2015 NR 80 min Dramas, Independent Movies, Romantic Movies As a volatile young couple who have been toget... 2015.0 9.0 5.6 NaT Low-rated
In [17]:
#Plotly chart - Bar chart to see the count of movies and shows in our data
#x-axis will show the content type and y-axis will show the count of content

color = ['steelblue','firebrick']
data = [go.Bar(x=['High-rated','Low-rated'],
y=[copy_count_rated.loc[copy_count_rated['rating_standard']=='High-rated'].shape[0],
   copy_count_rated.loc[copy_count_rated['rating_standard']=='Low-rated'].shape[0]],
    #marker_color =copy_count_rated['rating_num']
   marker=dict(color=color) 
    
)]
#create the layout of the chart by defining titles for chart, x-axis and y-axis
layout = go.Layout(title='Netflix content rating analysis',
            xaxis=dict(title='Type of ratings'),
            yaxis=dict(title='Total no. of ratings'),
            height =500,
            width = 700)

#Imbed data and layout into charts figure using Figure function
fig = go.Figure(data=data, layout=layout)
#Use plot function of plotly to visualize the data
fig.show()
In [18]:
#making copy to new dataframe so that the edit do not change the original dataframe
copy_count_rated_country =copy_count_rated.copy()

#replacing abbreviation of country names with their full name
copy_count_rated_country = copy_count_rated_country.replace({'country': {'gb':'United Kingdom', 'ar':'Argentina', 'hk':'Hongkong','be':'Belgium','hu':'Hungary','cz':'cezhrepublic',
                  'de':'Germany','jp':'Japan','se':'Sweden','ru':'Russia','au':'Australia','nl':'Netherland','usa':'United States','in':'India',
                  'lt':'Luthvania','br':'Brazil','mx':'Mexico','sg':'Singapore','fr':'France','kr':'South Korea','it':'Italy'}})
copy_count_rated_country.head(1)
Out[18]:
netflixid type title director cast country date_added release_year rating duration listed_in description added_year added_month rating_num unogsdate rating_standard
0 70234439 TV Show Transformers Prime NaN Peter Cullen, Sumalee Montano, Frank Welker, J... United States 2018-09-08 2013 TV-Y7-FV 1 Season Kids' TV With the help of three human allies, the Autob... 2018.0 9.0 7.8 NaT High-rated
In [19]:
#acessing specific columns
rating_analysis_country_filter = copy_count_rated_country.loc[:,['netflixid','type','title','country','rating','duration','listed_in','description','rating_num','rating_standard']]
print(rating_analysis_country_filter.shape)

rating_analysis_country_filter.head(3)
(764, 10)
Out[19]:
netflixid type title country rating duration listed_in description rating_num rating_standard
0 70234439 TV Show Transformers Prime United States TV-Y7-FV 1 Season Kids' TV With the help of three human allies, the Autob... 7.8 High-rated
1 80045922 Movie 6 Years United States NR 80 min Dramas, Independent Movies, Romantic Movies As a volatile young couple who have been toget... 5.6 Low-rated
2 80182115 Movie Long Shot United States TV-14 40 min Documentaries When Juan Catalan is arrested for a murder he ... 7.4 High-rated
In [20]:
#country_high_rating_count = x
# in country column there are more than one country in a row, so making separate rows for each country 
#so creating a function and using for loop to separate the each country in different rows
def get_each_country(x):
    ids_list =[]
    countries_df=pd.DataFrame()   #making a new empty dataframe
    x['country'] = x['country'].astype(str)   #making sure that all values in country columns are in string format
    for i in range(len(x)):   #using for loop
        nid = x.iloc[i,0]                     #getting particular values
        typ = x.iloc[i,1]                       #getting particular values
        titl=x.iloc[i,2]                     #getting particular values
        value = x.iloc[i,3]                     #getting particular values
        rating_typ = x.iloc[i,4]                  #getting particular values
        dura = x.iloc[i,5]
        genre = x.iloc[i,6]
        desc = x.iloc[i,7]
        rate = x.iloc[i,8]
        rate_std = x.iloc[i,9]                       #getting particular values
        if ',' in value:                             #checking if comma is in the specific row value 
            splitted= value.split(',')         #splitting the value  separting with commas
            ids_list.append(nid)          #getting id of that value
            if len(splitted)==2:          #checking how many values were separated with commas (condition with 2 values)
                #making a list of series to append later in new dataframe
                listOfSeries = [pd.Series([nid, typ, titl, splitted[0].strip(),rating_typ, dura,genre,desc,rate,rate_std], index=x.columns ) , #making a complete row
                        pd.Series([nid, typ, titl, splitted[1].strip(),rating_typ, dura,genre,desc,rate,rate_std], index=x.columns )]          #making a complete row
            elif len(splitted) ==3:         # 3 country names separated with commas
                listOfSeries = [pd.Series([nid, typ, titl, splitted[0].strip(),rating_typ, dura,genre,desc,rate,rate_std], index=x.columns ) ,#making a complete row
                        pd.Series([nid, typ, titl, splitted[1].strip(),rating_typ, dura,genre,desc,rate,rate_std], index=x.columns ),#making a complete row
                        pd.Series([nid, typ, titl, splitted[2].strip(),rating_typ, dura,genre,desc,rate,rate_std], index=x.columns )]#making a complete row
            else:          #more tha three countries 
                listOfSeries = [pd.Series([nid, typ, titl, splitted[0].strip(),rating_typ, dura,genre,desc,rate,rate_std], index=x.columns ) , #making a complete row
                        pd.Series([nid, typ, titl, splitted[1].strip(),rating_typ, dura,genre,desc,rate,rate_std], index=x.columns ),#making a complete row
                        pd.Series([nid, typ, titl, splitted[2].strip(),rating_typ, dura,genre,desc,rate,rate_std], index=x.columns ), #making a complete row
                        pd.Series([nid, typ, titl, splitted[3].strip(),rating_typ, dura,genre,desc,rate,rate_std], index=x.columns )]#making a complete row

            countries_df = countries_df.append(listOfSeries , ignore_index=True) #appending all these rows in a new dataframe

    #print(len(id_list))
    #print(len(country_df))
    #print(country_df)


    countries_index = x.set_index('netflixid') #setting index in the dataframe that is passed to this funntion
    #country_index

    for each in range(len(x)):                
        if x.iloc[each][0] in ids_list:    #checking if the each id is in the appended list above
            val = x.iloc[each][0]            #getting id as val
            countries_index = countries_index.drop(val,axis=0) #dropping the val(id)from the dataframe
        else:
            continue
    countries_index  =   countries_index.reset_index() #resetting the index 


    countries_index = countries_index.append(countries_df, ignore_index =True)  #appending the index 
    return countries_index    #returning the dataframe
In [21]:
rating_analysis_expand = get_each_country(rating_analysis_country_filter)  #calling above function to get each country rows separate
#print(rating_analysis_expand.shape)
rating_analysis_expand.head(3)
Out[21]:
netflixid type title country rating duration listed_in description rating_num rating_standard
0 70234439 TV Show Transformers Prime United States TV-Y7-FV 1 Season Kids' TV With the help of three human allies, the Autob... 7.8 High-rated
1 80045922 Movie 6 Years United States NR 80 min Dramas, Independent Movies, Romantic Movies As a volatile young couple who have been toget... 5.6 Low-rated
2 80182115 Movie Long Shot United States TV-14 40 min Documentaries When Juan Catalan is arrested for a murder he ... 7.4 High-rated
In [22]:
#len(rating_analysis_expand.rating_standard.to_list())

#for i in range(len(rating_analysis_expand.rating_standard.to_list())):
#    print(rating_analysis_expand.rating_standard[i])
In [23]:
# getting the rows that belongs to high-rated 
country_high_rating = rating_analysis_expand[rating_analysis_expand.rating_standard == 'High-rated']
#getting with high-rated
country_high_rating_count= country_high_rating.loc[:,['netflixid','country']]
high_rating_by_country = country_high_rating_count.rename(columns={'country':'high-rating_country'})
high_rating_by_country
#counting countries that has more high-rated
ccount_high = high_rating_by_country['high-rating_country'].value_counts()
#ccount_high
In [24]:
 #getting the rows that belongs to low-rated
country_low_rating = rating_analysis_expand[rating_analysis_expand.rating_standard == 'Low-rated']
country_low_rating_count= country_low_rating.loc[:,['netflixid','country']]
#getting countries with low-rated and renaming the columns
low_rating_by_country = country_low_rating_count.rename(columns={'country':'low-rating_country'})
#counting the total number of low-rated movies in that country
ccount_low = low_rating_by_country['low-rating_country'].value_counts()
In [25]:
#concating high -rated dataframe and lowrated dataframe
rating_compare = pd.concat([ccount_high, ccount_low], axis=1)
rating_compare =rating_compare.reset_index()
#rating_compare
In [26]:
#renaming the columns of concatenated dataframe
rating_compare_e =rating_compare.rename(columns={'index':'country',
                                                    'high-rating_country':'High-rated',
                                                    'low-rating_country':'Low-rated'})

rating_compare_e.iloc[:]
Out[26]:
country High-rated Low-rated
0 United States 156.0 171.0
1 United Kingdom 53.0 12.0
2 Canada 24.0 25.0
3 Japan 20.0 38.0
4 Hongkong 14.0 19.0
5 Germany 13.0 10.0
6 India 12.0 13.0
7 cezhrepublic 11.0 12.0
8 Argentina 9.0 11.0
9 Belgium 8.0 5.0
10 Sweden 8.0 10.0
11 Australia 8.0 10.0
12 France 7.0 19.0
13 Hungary 7.0 7.0
14 Mexico 7.0 6.0
15 Russia 6.0 4.0
16 nan 5.0 12.0
17 Netherland 4.0 4.0
18 Spain 4.0 7.0
19 Denmark 3.0 NaN
20 Colombia 3.0 NaN
21 Hong Kong 3.0 7.0
22 China 3.0 4.0
23 Brazil 3.0 7.0
24 Ireland 3.0 1.0
25 Taiwan 2.0 3.0
26 New Zealand 2.0 5.0
27 ca 2.0 1.0
28 Singapore 2.0 3.0
29 Turkey 2.0 5.0
30 Ukraine 1.0 NaN
31 Israel 1.0 NaN
32 South Korea 1.0 6.0
33 is 1.0 NaN
34 sk 1.0 1.0
35 Ecuador 1.0 NaN
36 Norway 1.0 NaN
37 Bermuda 1.0 NaN
38 Egypt 1.0 NaN
39 Luthvania 1.0 1.0
40 Philippines 1.0 2.0
41 Netherlands 1.0 5.0
42 1.0 NaN
43 Italy NaN 13.0
44 th NaN 3.0
45 United Arab Emirates NaN 2.0
46 South Africa NaN 1.0
47 pt NaN 1.0
48 Pakistan NaN 1.0
49 Qatar NaN 1.0
50 gr NaN 1.0
51 Poland NaN 1.0
52 Bulgaria NaN 1.0
In [27]:
#using melt functionality to set the columna name as valuues
rating_compare_edit = rating_compare_e.melt(id_vars=['country'], value_vars=['High-rated', 'Low-rated'])
#rating_compare_edit.columns.values
#rating_compare_edit.country.to_list()


#using iloc to get specific countries
rating_compare_country = rating_compare_edit.iloc[[0,53,1,54,2,55,3,56,4,57,5,58,6,59,7,60,12,65],:]
rating_compare_country
Out[27]:
country variable value
0 United States High-rated 156.0
53 United States Low-rated 171.0
1 United Kingdom High-rated 53.0
54 United Kingdom Low-rated 12.0
2 Canada High-rated 24.0
55 Canada Low-rated 25.0
3 Japan High-rated 20.0
56 Japan Low-rated 38.0
4 Hongkong High-rated 14.0
57 Hongkong Low-rated 19.0
5 Germany High-rated 13.0
58 Germany Low-rated 10.0
6 India High-rated 12.0
59 India Low-rated 13.0
7 cezhrepublic High-rated 11.0
60 cezhrepublic Low-rated 12.0
12 France High-rated 7.0
65 France Low-rated 19.0
In [28]:
rating_compare_country
Out[28]:
country variable value
0 United States High-rated 156.0
53 United States Low-rated 171.0
1 United Kingdom High-rated 53.0
54 United Kingdom Low-rated 12.0
2 Canada High-rated 24.0
55 Canada Low-rated 25.0
3 Japan High-rated 20.0
56 Japan Low-rated 38.0
4 Hongkong High-rated 14.0
57 Hongkong Low-rated 19.0
5 Germany High-rated 13.0
58 Germany Low-rated 10.0
6 India High-rated 12.0
59 India Low-rated 13.0
7 cezhrepublic High-rated 11.0
60 cezhrepublic Low-rated 12.0
12 France High-rated 7.0
65 France Low-rated 19.0
In [29]:
#plotting the data into sunburst using plotly
fig = px.sunburst(rating_compare_country, path=['country', 'variable'], values='value', color='country',
                  title='Analysis of Netflix ratings by country', height = 700, width =900)
fig.show()
In [ ]:
# getting a glance at types of genres we have in our dataframe
rating_analysis_expand.listed_in.to_list()

rating_analysis_expand.columns.values
In [32]:
# in our datafraem we noticed that we had many genres in each row so separating it to process and analyze it.
#so using the function to separate the values by genre and making them individual rows
def get_each_genre(x):
    id_list =[]
    genre_df=pd.DataFrame()    #creaing new dataframe
    x['listed_in'] = x['listed_in'].astype(str)   #converting ito string format
    for i in range(len(x)):       #using for loop
        nid = x.iloc[i,0]  #accesing specific value
        typ = x.iloc[i,1]      #accesing specific value
        titl=x.iloc[i,2]         #accesing specific value
        country = x.iloc[i,3]        #accesing specific value
        rating_typ = x.iloc[i,4]#accesing specific value
        dura = x.iloc[i,5]          #accesing specific value
        value = x.iloc[i,6]         #accesing specific value
        desc = x.iloc[i,7]          #accesing specific value
        rate = x.iloc[i,8]          #accesing specific value
        rate_std = x.iloc[i,9]       #accesing specific value
        if ',' in value:        #checking for the comma in the value to split it accordingly
            splitted= value.split(',')  #splotting th value
            id_list.append(nid)
            if len(splitted)==2:    #making a new series for more than two generes in each row
                listOfSeries = [pd.Series([nid, typ, titl, country,rating_typ, dura,splitted[0].strip(),desc,rate,rate_std], index=x.columns ) ,
                        pd.Series([nid, typ, titl, country,rating_typ, dura,splitted[1].strip(),desc,rate,rate_std], index=x.columns )]
            elif len(splitted) ==3:   #making a new series for more than two generes in each row
                listOfSeries = [pd.Series([nid, typ, titl, country,rating_typ, dura,splitted[0].strip(),desc,rate,rate_std], index=x.columns ) ,
                        pd.Series([nid, typ, titl, country,rating_typ, dura,splitted[1].strip(),desc,rate,rate_std], index=x.columns ),
                        pd.Series([nid, typ, titl, country,rating_typ, dura,splitted[2].strip(),desc,rate,rate_std], index=x.columns )]
            

            genre_df = genre_df.append(listOfSeries , ignore_index=True)  #appending the dataframe with new rows

    #print(len(id_list))
    #print(len(country_df))
    #print(country_df)

    
   
    
    countries_genre_index = x.set_index('netflixid')  #setting the index
    #countries_genre_index
    id_list_final = set(id_list)     #removing duplicate items in the list by making it set type
    
    for each in id_list_final:
        countries_genre_index = countries_genre_index.drop(each, axis = 0)  #dropping the rows that have been repeated in another dataframe
        
    
    countries_genre_index  =   countries_genre_index.reset_index()  #resetting index 
    countries_genre_index = countries_genre_index.append(genre_df, ignore_index =True) #appending the new dataframe
    return countries_genre_index #returning the cleaned dataframe 
In [33]:
rating_analysis_genre = get_each_genre(rating_analysis_expand)  #calling the function to process the genre columns
rating_analysis_genre
Out[33]:
netflixid type title country rating duration listed_in description rating_num rating_standard
0 70234439 TV Show Transformers Prime United States TV-Y7-FV 1 Season Kids' TV With the help of three human allies, the Autob... 7.8 High-rated
1 80182115 Movie Long Shot United States TV-14 40 min Documentaries When Juan Catalan is arrested for a murder he ... 7.4 High-rated
2 81001809 Movie Lessons from a School Shooting: Notes from Dun... United States TV-PG 24 min Documentaries Two priests – one from Dunblane, Scotland, the... 5.8 Low-rated
3 80005444 Movie Print the Legend United States TV-14 100 min Documentaries This award-winning, original documentary chron... 7.1 High-rated
4 80097321 Movie Audrie & Daisy United States TV-14 99 min Documentaries In this wrenching documentary, two teens are s... 7.2 High-rated
... ... ... ... ... ... ... ... ... ... ...
1439 70184127 TV Show Big Bad Beetleborgs Japan TV-Y7-FV 2 Seasons TV Comedies When three teens free a spirit that offers to ... 6.3 Low-rated
1440 70180088 TV Show The Garfield Show France TV-Y7 3 Seasons Kids' TV Lazy, lasagna-loving fat cat Garfield lives li... 5.6 Low-rated
1441 70180088 TV Show The Garfield Show France TV-Y7 3 Seasons TV Comedies Lazy, lasagna-loving fat cat Garfield lives li... 5.6 Low-rated
1442 70180088 TV Show The Garfield Show United States TV-Y7 3 Seasons Kids' TV Lazy, lasagna-loving fat cat Garfield lives li... 5.6 Low-rated
1443 70180088 TV Show The Garfield Show United States TV-Y7 3 Seasons TV Comedies Lazy, lasagna-loving fat cat Garfield lives li... 5.6 Low-rated

1444 rows × 10 columns

rating_analysis_genre.listed_in.value_counts() rating_analysis_genre_Get = rating_analysis_genre.loc[rating_analysis_genre.rating_num == 0] rating_analysis_genre_Get
In [34]:
# there are too many genres so processing them to major genres accordingly by replace functionality
rating_analysis_genre['listed_in'] = rating_analysis_genre['listed_in'].replace(['Movie','Classic Movies',
                                'Kid\'s TV', 'Teens TV','Teen TV Shows','Kids\' TV',
                                'Stand-Up Comedy','TV Comedies','Stand-Up Comedy & Talk Shows',
                                'TV Dramas','Docuseries','TV Action & Adventure','Crime TV Shows',
                                'Children & Family Movies', 'Faith & Spirituality',
                                'Independent Movies','LGBTQ Movies',
                                'Thrillers','Horror','TV Horror','TV Thrillers',
                                'TV Sci-Fi & Fantasy','Romantic Moives','Romantic TV Shows',
                                'TV Mysteries','British TV Shows','Classic & Cult TV','Korean TV Shows','Reality TV',
                        'Anime Features'],
                        ['International Movies','International Movies',
                        'Kid\'s and Teen','Kid\'s and Teen','Kid\'s and Teen', 'Kid\'s and Teen',
                        'Comedies','Comedies','Comedies',
                         'Dramas','Documentaries','Action & Adventure','Crime',
                        'Family Movies','Family Movies',
                        'Independent Movies and LGBTQ Movies','Independent Movies and LGBTQ Movies',
                        'Thrillers & Horror','Thrillers & Horror','Thrillers & Horror','Thrillers & Horror',
                        'Sci-Fi & Fantasy','Romantic','Romantic',
                        'Other Shows','Other Shows','Other Shows','Other Shows','Other Shows',
                        'Anime Series'])
In [35]:
#also replacing movie type to Movie type to make the data similar and comparable
rating_analysis_genre['type'] = rating_analysis_genre['type'].replace('movie','Movie')
In [ ]:
 
In [38]:
#using treemap to analyze the quality content of netflix in each country by genre
fig = px.treemap(rating_analysis_genre, path=['country','listed_in','rating_standard'], 
                 title ='Content analysis of each country by genre',
                  color='rating_num', hover_data=['rating_num'], 
                  color_continuous_scale='RdBu')
fig.show()

Duration analysis

In [39]:
#making a nes dataframe by selecting specific columns
duration_analysis = rating_analysis_genre.loc[:,['netflixid','type','duration','rating_standard','rating_num']]
# duration_analysis        #printing duration analysis
In [40]:
#getting specific rows where we find the movie because we are anlyzing the length of movie
movie_analysis = duration_analysis.loc[(duration_analysis['type'] =='Movie')]
 
#checking for the duplicates and drop the duplicated items in the dataframe
movie_analysis_edited = movie_analysis.drop_duplicates(subset=['netflixid'])

#splitting the duration column to actual duration and to units of duration measurrement(mins)
movie_analysis_edited[['duration_in_mins','duration_units']] = movie_analysis_edited.duration.str.split(" ",expand=True,)

#droping nan values
movie_analysis_edited = movie_analysis_edited.dropna()
#but all the values in duration column was not in same format so it wasnot easily splitted

#dataframe of high-rated movies with their duration
high_movie_analysis_edited = movie_analysis_edited[movie_analysis_edited.rating_standard == 'High-rated']
In [41]:
#duration column had different formatted values so used regular expression to ge the float numbers of out it.
s= movie_analysis_edited.duration.to_list()  #making a list
lis=[]
to_sum=[]
for each in s:
    counter = 0
    for each in re.findall(r'\d+',each):   #finding all the values using regular expression
        if counter == 0:  #setting the condition
            x =float(each)    #converting it to float
            if x ==1: 
                x = 60
            elif x ==2:
                x =120
            sume = x   #assigning the value
            #print(f"x is{x}")
            #print(f"sume is {sume}")
            
        else:
            y = float(each)   #converting to float
            #print(y)
            sume=sume+y
            #print(f"sume is {sume}")
        counter = counter +1
        #print(counter)
    lis.append(sume)  #appendig the list

movie_analysis_edited['duration_in_min'] = lis  #making a new column with the appended list to get the float numbers that represnet the duration

#movie_analysis_edited   #prints the dataframe
In [42]:
#plotting the histogram of distribution of movies with both high-standard and low-standard to analyze their distribution pattern
fig = px.histogram(movie_analysis_edited, x="duration_in_min", color="rating_standard", 
                   nbins=18,title='Distribution of duration of movies by rating standard', marginal="box") # can be `box`, `violin`)
                   
fig.show()
In [43]:
#plotting histogram to analyze the distribution of duration of high-rated movies
fig = px.histogram(high_movie_analysis_edited, x="duration_in_mins", 
                   title='Distribution of duration of high-rated movies', marginal="box",nbins=18)
fig.show()

Finding co-relation between duration and rating

In [47]:
#analyzing the realation of duration and rating
# to see if increase or decrease in length of movie would impact the rating
fig = px.scatter(movie_analysis_edited, x="duration_in_min", y="rating_num", color="rating_standard",
                 title = 'Analysis of co-relation between rating and duration',
                  hover_data=['type'])
fig.show()
In [49]:
#finding co-relation to see if the duration variable and rating variable are co-related
from scipy.stats import pearsonr 
  
# Convert dataframe into series 
list1 = movie_analysis_edited['duration_in_min'] 
list2 = movie_analysis_edited['rating_num'] 
  
# Apply the pearsonr() 
corr, _ = pearsonr(list1, list2) 
print('Pearsons correlation: %.3f' % corr)

correlation = list1.corr(list2)
correlation
Pearsons correlation: 0.180
Out[49]:
0.18002051843301892

Rating class popularity

In [50]:
# n_data_filter.head(1)   #taking  a glance at the dataframe

#accessing specifc columns 
data_rating_class= n_data_filter.loc[:,['netflixid','rating','country']]
data_rating_class.head(1)
Out[50]:
netflixid rating country
0 81145628 TV-PG United States, India, South Korea, China
In [51]:
# lets see which type of rating movie or TV show the netflix has the most
rating_value_counts = data_rating_class.rating.value_counts()
rating_value_counts
Out[51]:
TV-MA       2027
TV-14       1698
TV-PG        701
R            508
PG-13        286
NR           218
PG           184
TV-Y7        169
TV-G         149
TV-Y         143
TV-Y7-FV      95
G             37
UR             7
NC-17          2
Name: rating, dtype: int64

aliceblue, antiquewhite, aqua, aquamarine, azure, beige, bisque, black, blanchedalmond, blue, blueviolet, brown, burlywood, cadetblue, chartreuse, chocolate, coral, cornflowerblue, cornsilk, crimson, cyan, darkblue, darkcyan, darkgoldenrod, darkgray, darkgrey, darkgreen, darkkhaki, darkmagenta, darkolivegreen, darkorange, darkorchid, darkred, darksalmon, darkseagreen, darkslateblue, darkslategray, darkslategrey, darkturquoise, darkviolet, deeppink, deepskyblue, dimgray, dimgrey, dodgerblue, firebrick, floralwhite, forestgreen, fuchsia, gainsboro, ghostwhite, gold, goldenrod, gray, grey, green, greenyellow, honeydew, hotpink, indianred, indigo, ivory, khaki, lavender, lavenderblush, lawngreen, lemonchiffon, lightblue, lightcoral, lightcyan, lightgoldenrodyellow, lightgray, lightgrey, lightgreen, lightpink, lightsalmon, lightseagreen, lightskyblue, lightslategray, lightslategrey, lightsteelblue, lightyellow, lime, limegreen, linen, magenta, maroon, mediumaquamarine, mediumblue, mediumorchid, mediumpurple, mediumseagreen, mediumslateblue, mediumspringgreen, mediumturquoise, mediumvioletred, midnightblue, mintcream, mistyrose, moccasin, navajowhite, navy, oldlace, olive, olivedrab, orange, orangered, orchid, palegoldenrod, palegreen, paleturquoise, palevioletred, papayawhip, peachpuff, peru, pink, plum, powderblue, purple, red, rosybrown, royalblue, rebeccapurple, saddlebrown, salmon, sandybrown, seagreen, seashell, sienna, silver, skyblue, slateblue, slategray, slategrey, snow, springgreen, steelblue, tan, teal, thistle, tomato, turquoise, violet, wheat, white, whitesmoke, yellow, yellowgreen

In [52]:
#setting colors
color = ['salmon', 'firebrick', 'aqua', 'mediumorchid', 'orangered',
           'limegreen', 'gold', 'tomato', 'magenta', 'blue',
            'blueviolet', 'brown', 'burlywood', 'cadetblue',
            'chartreuse']
#using bar plotly
data = [go.Bar(x=['TV-MA','TV-14','TV-PG','R','PG-13','NR','PG','TV-Y7','TV-G','TV-Y','TV-Y7-F7','G','UR','NC-17'],

#setting y to be value count of each rating type
y=[data_rating_class.loc[data_rating_class['rating']=='TV-MA'].shape[0],  
   data_rating_class.loc[data_rating_class['rating']=='TV-14'].shape[0],
   data_rating_class.loc[data_rating_class['rating']=='TV-PG'].shape[0],
    data_rating_class.loc[data_rating_class['rating']=='R'].shape[0],
    data_rating_class.loc[data_rating_class['rating']=='PG-13'].shape[0],
    data_rating_class.loc[data_rating_class['rating']=='NR'].shape[0],
    data_rating_class.loc[data_rating_class['rating']=='PG'].shape[0],
               data_rating_class.loc[data_rating_class['rating']=='TV-Y7'].shape[0],
               data_rating_class.loc[data_rating_class['rating']=='TV-G'].shape[0],
               data_rating_class.loc[data_rating_class['rating']=='TV-Y'].shape[0],
               data_rating_class.loc[data_rating_class['rating']=='TV-Y7-F7'].shape[0],
               data_rating_class.loc[data_rating_class['rating']=='G'].shape[0],
               data_rating_class.loc[data_rating_class['rating']=='UR'].shape[0],
               data_rating_class.loc[data_rating_class['rating']=='NC-17'].shape[0]],
   marker=dict(color=color) 
    
)]
#create the layout of the chart by defining titles for chart, x-axis and y-axis
layout = go.Layout(title='Netflix content rating analysis',
            xaxis=dict(title='Type of ratings'),
            yaxis=dict(title='Total no. of ratings'),
            height =500,
            width = 700)

#embedding data and layout into charts figure using Figure function
fig = go.Figure(data=data, layout=layout)
#Use plot function of plotly to visualize the data
fig.show()
In [53]:
rating_analysis_expand.head(2)  #printing two rows of data frame to see what the data frame looks like
Out[53]:
netflixid type title country rating duration listed_in description rating_num rating_standard
0 70234439 TV Show Transformers Prime United States TV-Y7-FV 1 Season Kids' TV With the help of three human allies, the Autob... 7.8 High-rated
1 80045922 Movie 6 Years United States NR 80 min Dramas, Independent Movies, Romantic Movies As a volatile young couple who have been toget... 5.6 Low-rated
In [54]:
#making a copy of dataframe
rating_analysis_by_country = rating_analysis_expand.copy()

#filling empty values with not available for ease of analysis
rating_analysis_by_country=rating_analysis_by_country.fillna('N/A')

#checking if the dataframe now has null values
rating_analysis_by_country.isnull().sum()
Out[54]:
netflixid          0
type               0
title              0
country            0
rating             0
duration           0
listed_in          0
description        0
rating_num         0
rating_standard    0
dtype: int64
In [55]:
#plotting sunbrust plotly to see the type of rating type by the country
fig = px.sunburst(rating_analysis_by_country, path=['country', 'rating'],  color='country',
                  title='Analysis of Netflix ratings by country', height = 650, width =900)
fig.show()

now that we have analyzed the rating type, since we also have data of title, cast, directors lets analyze these pieces of incormation and see if we can come up with some insights

High-rated titles

In [56]:
# coming up with some popular words among all titles for high-rated movie/shows
country_high_rating.head(1)
#country_high_rating[country_high_rating.duplicated(['netflixid'])]
Out[56]:
netflixid type title country rating duration listed_in description rating_num rating_standard
0 70234439 TV Show Transformers Prime United States TV-Y7-FV 1 Season Kids' TV With the help of three human allies, the Autob... 7.8 High-rated
In [57]:
#opening image in numpy array format to shape word cloud in this saved image
#to do that we need to check the intensity of pixels, which acn be done by opening image in numpy format
TV_mask = np.array(Image.open("TV.jpg")) 
#TV_mask
In [188]:
#mask has to be in 255 pixels to use it in word cloud 
# our mask is in correct form so, let's create text for wordcloud
text = " ".join(country_high_rating['title'])

#creating a word cloud image
wc = WordCloud(background_color='black',max_words =500, mask = TV_mask, 
               contour_width =3, contour_color = 'red')

#generate a wordcloud
wc.generate(text)

#show
plt.figure(figsize=[20,10])
plt.title("Wordcloud for popoular words in titles", fontsize =30)
plt.imshow(wc)
plt.axis('off')
plt.show()

High-rated directors

In [61]:
# Similarly some popular directors from high rated movies/shows

#accessing specific columns
rated_directorANDactor = copy_count_rated.loc[:,['director','cast','rating_standard','title']]

#accessing specific rows
high_rated_directorANDactor = rated_directorANDactor.loc[rated_directorANDactor.rating_standard == 'High-rated']

#dropping nan values
high_rated_director = high_rated_directorANDactor.director.dropna()
#high_rated_director
In [64]:
# our mask is in correct form so, let's create text for wordcloud
text = " ".join(high_rated_director)

#creating a word cloud image with maximum words of 200 
wc = WordCloud(background_color='black',max_words =200, mask = TV_mask, 
               contour_width =3, contour_color = 'red')

#generate a wordcloud
wc.generate(text)

#show
plt.figure(figsize=[25,8])
plt.title("Popular directors", fontsize =25)
plt.imshow(wc)
plt.axis('off')
plt.show()
Most common 4 director names with High rated movies are: 1. Shannon Hartman 2. Jay Karas 3. Lilly Wachowski, Lana Wachowski 4. Mike Clattenburg
In [65]:
#having some information related with director Shannon Hartman
high_rated_directorANDactor[high_rated_directorANDactor['director'].str.match('^Shannon Hartman*')== True][:2]
Out[65]:
director cast rating_standard title
60 Shannon Hartman Kevin Hart High-rated Kevin Hart: Seriously Funny
78 Shannon Hartman, Michelle Caputo Donald Glover High-rated Donald Glover: Weirdo

most probably do not use the visualization for description wordcloud so its in nbc convert

# our mask is in correct form so, let's create text for wordcloud text = " ".join(country_high_rating['description']) #creating a word cloud image wc = WordCloud(background_color='black',max_words =1000, mask = TV_mask, contour_width =3, contour_color = 'red') #generate a wordcloud wc.generate(text) #show plt.figure(figsize=[20,10]) plt.imshow(wc) plt.axis('off') plt.show()rated_directorANDactor
In [ ]:
 

Pattern for content description

I would also like to analyze if there is any specific pattern in description of movie/show. Can we tell which genre will that movie/show belong by looking at the descritption? If yes, are there any commonalities we can find in that gerne's?

genre nlp try

In [66]:
rating_analysis_genre
Out[66]:
netflixid type title country rating duration listed_in description rating_num rating_standard
0 70234439 TV Show Transformers Prime United States TV-Y7-FV 1 Season Kid's and Teen With the help of three human allies, the Autob... 7.8 High-rated
1 80182115 Movie Long Shot United States TV-14 40 min Documentaries When Juan Catalan is arrested for a murder he ... 7.4 High-rated
2 81001809 Movie Lessons from a School Shooting: Notes from Dun... United States TV-PG 24 min Documentaries Two priests – one from Dunblane, Scotland, the... 5.8 Low-rated
3 80005444 Movie Print the Legend United States TV-14 100 min Documentaries This award-winning, original documentary chron... 7.1 High-rated
4 80097321 Movie Audrie & Daisy United States TV-14 99 min Documentaries In this wrenching documentary, two teens are s... 7.2 High-rated
... ... ... ... ... ... ... ... ... ... ...
1439 70184127 TV Show Big Bad Beetleborgs Japan TV-Y7-FV 2 Seasons Comedies When three teens free a spirit that offers to ... 6.3 Low-rated
1440 70180088 TV Show The Garfield Show France TV-Y7 3 Seasons Kid's and Teen Lazy, lasagna-loving fat cat Garfield lives li... 5.6 Low-rated
1441 70180088 TV Show The Garfield Show France TV-Y7 3 Seasons Comedies Lazy, lasagna-loving fat cat Garfield lives li... 5.6 Low-rated
1442 70180088 TV Show The Garfield Show United States TV-Y7 3 Seasons Kid's and Teen Lazy, lasagna-loving fat cat Garfield lives li... 5.6 Low-rated
1443 70180088 TV Show The Garfield Show United States TV-Y7 3 Seasons Comedies Lazy, lasagna-loving fat cat Garfield lives li... 5.6 Low-rated

1444 rows × 10 columns

In [67]:
#getting speciifc columns needed for analysis
rating_analysis_genre_edit =rating_analysis_genre.loc[:,['netflixid','listed_in','description','rating_standard']]

#looking for duplicates and keeping the first records if there are any duplicates
analysis_genre = rating_analysis_genre_edit.drop_duplicates(subset='netflixid',keep= 'first')
#print(analysis_genre.shape) #will print the shape of data frame
#analysis_genre

#analysis_genre.listed_in.to_list()    prints out the list of genres
In [68]:
#setting listed_in column as index
genre_pattern = analysis_genre.set_index('listed_in')

#getting popular genre descriptions like comedies, Action and adventures, Horror movies and Romantic
comedies_pattern = genre_pattern.loc[['Comedies']]
Action_Adventures_pattern =  genre_pattern.loc[['Action & Adventure']]
Horror_pattern = genre_pattern.loc[['Horror Movies']]
Romantic_pattern = genre_pattern.loc[['Romantic']]
In [69]:
# defining a function to get popular words from thd description of that specific genre 
#nltk.download('stopwords')
from nltk.corpus import stopwords

def get_pattern(x):
    text = " ".join(x['description'])  #joining all the values of Description COLUMN as a text
    
    # using word_tokenize() for splitting strings into tokens (nominally words). 
    #It splits tokens based on white space and punctuation. For example, commas and periods are taken as separate tokens.
    textwords = word_tokenize(text)       
    
    #setting stopwords 
    stop_words = set(stopwords.words("english"))
    stop_words.update(['series','finds'])  #updating stopwords because as these words might not have specifc meaning
    exclud_punc=[]  #making a new list
    finl_words=[]  #making a new list
    for w in textwords:        #using a for loop in tokenized text
        if w.isalpha():        # checking whether a character is an alphabet or not
            exclud_punc.append(w.lower())    #making sure that all the strings are lower case and appeding to the new list and 
    for word in exclud_punc:                 # using for loop in the appended list
        if word not in stop_words:          #checking if the words are contained in stop_words
            finl_words.append(word)         #appedning the filtered words, which are not in stop words

    return finl_words       #returning the appended list
In [70]:
comedy_word_list = get_pattern(comedies_pattern)    #calling a function to get popular words for comedy genre
action_Adv_list = get_pattern(Action_Adventures_pattern) #calling a function to get popular words for action and adventure genre
horror_list =get_pattern(Horror_pattern)  #calling a function to get popular words for horror genre
romantic_list = get_pattern(Romantic_pattern) #calling a fucntion to get popular words for romantic genre
I don't need this fdist_comedy= FreqDist(comedy_word_list) fdist_act_Adv= FreqDist(action_Adv_list) fdist_horror = FreqDist(horror_list) print(fdist_horror.most_common(20))
In [71]:
#w1 = WordCloud(max_font_size=50, max_words=150, colormap="Oranges_r").generate(horror_list_wc)
#wordcloud2 = WordCloud().generate(action_Adv_wc)

#making a list of the lists
list_words = [horror_list,action_Adv_list, comedy_word_list, romantic_list]

#making a list for the tile
title=['horror genre','action and adventure genre','comedy genre','romantic genre']
j =0   #setting the counter
for i in list_words:   #using for loop in the lists
    text = " ".join(i)   #joining all the words from the list and making it like whole bag of words  
    w1 = WordCloud(max_font_size=50, max_words=150, colormap="Oranges_r").generate(text)  #generating wordcloud
    plt.figure(figsize = (10, 8))
    plt.imshow(w1)
    plt.title(f"Popoular words for {title[j]}", fontsize=20)  #setting title
#plt.imshow(wordcloud2)
    plt.axis("off")
    plt.show() #plotting
    j= j+1  #increasing counter
In [ ]:
 
In [ ]:
 

Pattern for sentiments of title

I have seen some people who are only tempted to see the movie which have positive sentiments. Like my aunt only watches the movie which gives her positive influence. She reads the description of movie and if the movie description has word like violence (negative vibes), she will ignore that. So lets analyze the sentiment of the description of title.

In [72]:
#getting specific columns from the dataframe
copy_count_rated_country
synopsis_analysis = copy_count_rated_country.loc[:,['description', 'rating_num','rating_standard','rating']]
synopsis_analysis = synopsis_analysis.set_index(['description'])  #setting index
synopsis_analysis = synopsis_analysis.reset_index()  #resetting index
synopsis_analysis.head(2) 
Out[72]:
description rating_num rating_standard rating
0 With the help of three human allies, the Autob... 7.8 High-rated TV-Y7-FV
1 As a volatile young couple who have been toget... 5.6 Low-rated NR
In [73]:
#making a new list
value_list =[]  
sentiment_list=[]

#using for loop to iterate
for each in range(len(synopsis_analysis)):
    try:
        text = synopsis_analysis['description'][each]  #getting specific value of description column 
        analysis = TextBlob(text)   #using python library to process the textual data.
        value = analysis.sentiment.polarity  #analyzing the sentiment of text
        value_list.append(value)
        if value > 0:                  #setting the condition if the sentiment is greater than 0 to be positive
            sentiment = 'positive'
        elif value == 0:           #setting the condition if the sentiment is 0 to be neutral
            sentiment ='neutral'
        else:                        #setting the condition if the sentiment is lesser than 0 to be negative
            sentiment ='negative'
        sentiment_list.append(sentiment)  #appending sentiment values i.e. either positive, neutral or negative to the list
    except:
        continue

#print(value_list)  #prints the list
#print(len(sentiment_list))
In [74]:
# making the list into the column of dataframe
synopsis_analysis['sentiment'] = sentiment_list

#dropping nan values of the dataframe
sentiment_analysis = synopsis_analysis.dropna()


# calculating counts of sentiment movies/show 
count_sentiments = sentiment_analysis.sentiment.value_counts()
In [75]:
#making a function to calculate the percent of sentiment values(i.e. either positive/neutral/negative)
def get_sentiment_count(df):
    y_count = df.sentiment.value_counts().reset_index()  #counting the values and restting the index
    y_count =y_count.rename(columns={'index': "sentiment_value","sentiment":"counts"})  #renaming the columns
    total = sum(y_count['counts'])  #calcualting the sum 
    y_count['percent'] = y_count['counts'].apply(lambda x: (x/total)*100 )  #applying the percent function using lambda
    return y_count #returning the dataframe

high_count = get_sentiment_count(sentiment_analysis)
In [76]:
#plotting the percentage of sentiment for high-rated movies
fig = go.Figure(data=[go.Pie(labels=high_count['sentiment_value'], values=high_count['percent'], hole=.3)])

fig.update_layout(
    title_text="Synopsis sentiment of high-rated movies",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='sentiments', x=0.50, y=0.5, font_size=14, showarrow=False)])

fig.show()

#this is good news for people like my aunt, as she prefers to watch the movie that has positive sentiments. 
In [77]:
#GETTING sentiment data for high-rated
high_rated = sentiment_analysis[sentiment_analysis.rating_standard == 'High-rated']

#getting sentiment data for low-rated
low_rated = sentiment_analysis[sentiment_analysis.rating_standard == 'Low-rated']

#dataframe with high positive sentiments among high-rated movies
only_high_pos = high_rated[high_rated['sentiment'] == 'positive']

#dataframe with high negative sentiments among high-rated movies
only_high_neg = high_rated[high_rated['sentiment'] == 'negative']
In [78]:
#finding popular words to look for high-rated movie/shows with positive sentiments
high_pos_words = get_pattern(only_high_pos)    #calling a previous function

#finding popular words to look for high-rated movie/shows with negative sentiments
high_neg_words = get_pattern(only_high_neg)#calling a previous function
In [79]:
#plotting word cloud that shows the popoular words to look to find high-rated positive sentiment movie/show
text = " ".join(high_pos_words)  #joining all the the words

stopwords = set(STOPWORDS)  #setting stopwords
stopwords.update(["series",'show'])  #updating stopwords with series and show as these words might not be that significant
#creating a word cloud image
wc = WordCloud(stopwords =stopwords, background_color='white',max_words =1000)
#generate a wordcloud
wc.generate(text)

#show
plt.figure(figsize=[15,7])
plt.title("Positive words to find high-rated movies with positive-sentiment", fontsize = 30)

plt.imshow(wc)
plt.axis('off')
plt.show()

Recommendation according to title, type, cast, director, listed_in

Using countvectorizer and cosine similarity to provide some recommendations according to title, type, cast, director and genre.

Countvectorizer is used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text. And ‘cosine_similarity’ is used to find the similarity. Using cossine similarity means to calculate the cosine of the angle between two vectors. It does not mean finding straight line distance between two points

In [80]:
#getting specific columns from the dataframe
new_df_recommend = n_data_filter.loc[:,['type','title','director','cast','listed_in']]
new_df =new_df_recommend.copy()  #making a copy of the dataframe
new_df = new_df.fillna('') #filling na values 
#new_df
In [81]:
#defining a function to clean data according to the need
def clean_data(x):
#Check if string exists. If not, return empty string
    if isinstance(x, str): 
        return str.lower(x.replace(" ", "")) #making string lowercase and replacing whitespace
    else:
        return ''
In [82]:
# using for loop and calling the function to clean the data
columns = ['type','title','director','cast','listed_in']
for col in columns:
    new_df[col] = new_df[col].apply(clean_data)
In [83]:
#defining function that comines all the columns together
def combine_cols(x):
    return ''.join(x['type']) + ' ' + ''.join(x['title']) + ' ' + x['director'] + ' ' + ''.join(x['cast'])+' ' + '' .join(x['listed_in'])

#applying the defined functions
new_df['combined_cols'] = new_df.apply(combine_cols , axis=1)
# new_df['combined_cols']
In [179]:
#making another recommend function  using countvectorizer 
def recommend_func(x,user_title,new_df_recommend):
    
    #calling countvectorizer and using stop_words functionality to remove the unwanted words
    count_convert_vector = CountVectorizer(stop_words='english')
    
    #Convert a collection of text documents to a matrix of counts
    count_convert_vector_matrix = count_convert_vector.fit_transform(x['combined_cols'])

    # Compute the Cosine Similarity matrix based on the matrix of counts
    from sklearn.metrics.pairwise import cosine_similarity

    #calling a cosine_similarity function and passing the counts 
    similarity_metric = cosine_similarity(count_convert_vector_matrix, count_convert_vector_matrix)

    # Reset index of your main DataFrame and construct reverse mapping as before
    try:
        x = x.reset_index()
        index_col = pd.Series(x.index, index=x['title'])
    except:
        index_col = pd.Series(x.index, index=x['title'])
    
    
    ################      recommendations_title_by_genre(user_title, similarity_metric)
    user_title =user_title.replace(' ','').lower()

    # Get the index of the movie that matches the title
    index_val = index_col[user_title]
    
    print("The 5 shows related to your title are:\n")


    # Get the pairwsie similarity scores of all movies with that movie
    similarity_scores = enumerate(similarity_metric[index_val])


    # Sort the movies based on the similarity scores
    #using the sort function to filter the scores and arrange them in descending order
    similarity_scores = sorted(similarity_scores, key=lambda a: a[1], reverse=True)
    
    # Get the scores of the 5 most similar movies
    similarity_scores = similarity_scores[1:6]
    
    
   
    # Get the movie indices
    movie_index_val=[]
    #usig for loop to get the scores and appending it to new list 
    for similar in similarity_scores:
        value = similar[0]
        movie_index_val.append(value)
    

    # Return the top 5 most similar movies
    my_list = new_df_recommend['title'].iloc[movie_index_val]
    for my in my_list:  #using for loop to print one by one
        print(my)
In [180]:
#defining another function to get the title that the user chose
def get_df_title_genre(new_df,user_genre, user_title):
    #replacing the whitespaces because our all the titles and other features are combined and removed whitespaces for processing
    user_genre = user_genre.replace(' ','').lower()
    #getting data frame with specific genre from listed_in columns
    x = new_df[new_df.listed_in == user_genre]
    
    #calling the function and passing the dataframe, title and genre to the function
    recommend_func(x,user_title,new_df_recommend)

def get_df_title(new_df,user_title): #defining the function that will get the title 
    x = new_df.copy()  #making the new dataframe
    recommend_func(x,user_title,new_df_recommend)#calling the function and passing the title name, data frame 
    
In [181]:
def choose_genre(user_genre): #defining the function to let the user choose genre
    
    # getting the specific genre according to user choice
    movie_names =new_df_recommend[new_df_recommend['listed_in']== user_genre]
    print(user_genre) #printing the user genre choice
    name_list = movie_names['title'].to_list() #getting the title list of titles from the genre chose
    print(name_list[0:3]) #printing three title names based on the genre chose
In [186]:
#defining the menu function 
def menu():
    while True:
        #getting the input from users
        inp = input("How would you like to get the recommendations? \n"
                        "      Enter 'T' if you would like to get recommendation just by title:\n"
                        "      Enter 'G' if you would like to get recommendation with genre and title:\n"
                        "      Enter 'N' for no recommendations and to quit:\n").upper()

        if inp == 'T': #setting the condition
            user_input=input("Enter the title name to get some recommendation: ")  #asking the input
            
            print(f"Your title is:{user_input} ") #printing the title or the input that the user entered
            print()  #leaving one line space
            print("The 5 shows related to your title are: \n") #print prompt
            get_df_title(new_df,user_input) #calling the function that passes the user title and dataframe

        elif inp =='G':  #setting another condition
            print("Genre options:\n"  #printing the prompt
                  "    (Enter) 'A'for 'Documentaries'\n"
                  "    (Enter) 'B' for 'Stand-Up Comedy'\n"
                  "    (Enter) 'C' for 'Dramas, Independent Movies, International Movies'\n"
                  '    (Enter) "D" for "Kids\' TV "\n'
                  "    (Enter) 'E' for 'Dramas, International Movies, Romantic Movies'\n"
                  "    (Enter) 'F' for 'Action & Adventure, Sci-Fi & Fantasy'\n"
                  "    (Enter) 'G' for 'Horror Movies, Thrillers'\n")
                 

            user_genre = input("Enter the letter for the genre you would like:\n ").upper() # prompt for user input
            list_options = ['A','B','C','D','E','F','G']  #setting the list with user options
            
            #list of genre options
            genre_options = ['Documentaries','Stand-Up Comedy','Dramas, Independent Movies, International Movies', "Kids\' TV",
                             'Dramas, International Movies, Romantic Movies','Action & Adventure, Sci-Fi & Fantasy','Horror Movies, Thrillers']
            #setting the condition and checking if the user options is in list options
            if user_genre in list_options:
                i = list_options.index(user_genre)   #getting the index of user genre
                genre_name = genre_options[i]      
                print(f"You selected {genre_name} genre.\n")   #printing the genre name
                choose_genre(user_genre = genre_name) #calling the function that will choose genre
            else:
                print()
                #print prompt
                print("You should select the above genre options. Please select genre from above options")
                menu() #if the user did not enter the input from the menu provided call the menu function again

            user_title =input("Enter the title name to get some recommendation like that: ") #getting title input
            print()
        
            
            try:
                get_df_title_genre(new_df,genre_name, user_title) #calling the title function passing the user chose title and the dataframe
            except Exception:
                print("The title could not be found. Please enter other title name.")
                menu()


        elif inp == 'N':  #setting condition to validate the user choice
            return "Thank you!!!!!"
            break  #breaking the loop if user entered the invalid input
            
        
        else:
            print("Please select the correct options.")
            continue  #continuing the loop to provide the user to continue with the program
In [187]:
menu()  #calling the function
How would you like to get the recommendations? 
      Enter 'T' if you would like to get recommendation just by title:
      Enter 'G' if you would like to get recommendation with genre and title:
      Enter 'N' for no recommendations and to quit:
n
Out[187]:
'Thank you!!!!!'
In [ ]:
 

Predicting Rating according to classification (machine learning technique)

In [89]:
#n_data_filter.head(1)  #taking a glance of the dataframe
In [90]:
# getting specific columns from the dataframe
data_knn = n_data_filter.loc[:,['netflixid','title','director','cast','rating']]

#dropping the nan values from the dataframe
data_knn = data_knn.dropna()
#data_knn   #looking how the dataframe looks like

Now we want to preidict the rating on based on cast, director, and rating category (TVMA, TV-14) of the movie/show. This technique is also called the supervised machine learning because we actually know what we want to predict. We know that all the features we are going to choose is in string format to we have to convert them to binary for classification

In [91]:
#converting the cast names to binary

#making a empty list
castList = []

#using the for loop to loov over the cast column of dataframe to get the speciifc cast names
for ind, record in data_knn.iterrows():
    cast = record["cast"]
    
    # getting each cast names froma all the cast list
    for i in cast:
        #validating if the cast name is in the list because we do not want the same cast name to be repeated
        if i not in castList:  
            castList.append(i)  #only appending the cast name that is not in the list 


#print(castList)
In [92]:
# once we have each cast name we have to convert it to 1 or 0's
#so we define the function called convert
def convert(x_list, data_list):
    
    #made a new list so that we will append it later
    convert_to_binary=[]
    
    #using for loop from the list passed 
    for y in data_list:
        if y in x_list:  #setting condition to check if the list contains the items
            convert_to_binary.append(1) #so if the item was in the list convert that item to 1
        else:
            convert_to_binary.append(0)  # if the item was not in the list convert it to 0.
        
    return convert_to_binary #returning the list

#applying the function for cast column of dataframe because we wanted to convert the cast column to binary
data_knn['cast_binary'] = data_knn['cast'].apply(lambda x: convert(x,castList)) 
data_knn['cast_binary'].head(2)
Out[92]:
0    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
4    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
Name: cast_binary, dtype: object
In [93]:
# making a director list so that we will have director names and append it to this list
directorList=[]
for i in data_knn['director']:  #using for loop in director columns
    if i not in directorList:  #validating if the director name is not repeated
        directorList.append(i)   #appending the director names in the list
        
        
#calling a function and making a new column in dataframe with 1 and 0 values      
data_knn['director_binary'] = data_knn['director'].apply(lambda x: convert(x, directorList))
data_knn.head(2)
Out[93]:
netflixid title director cast rating cast_binary director_binary
0 81145628 Norm of the North: King Sized Adventure Richard Finn, Tim Maltby Alan Marriott, Andrew Toth, Brian Dobson, Cole... TV-PG [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4 80125979 #realityhigh Fernando Lebrija Nesta Cooper, Kate Walsh, John Michael Higgins... TV-14 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ... [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
In [94]:
#also we are using rating category like: TV-MA, TV-14 to predict the rating so
#making a new description list

rating_list = []

#using for loop to get the row and column value of the dataframe
for ind, record in data_knn.iterrows():
    x = record["rating"]
    
    #using for loop to get each rating category
    for each in x:
        if each not in rating_list: #validating that the rating category is not repeated for same record
            rating_list.append(each)

# calling the function to convert it to binary and making a new column 
data_knn['rating_binary'] = data_knn['rating'].apply(lambda x: convert(x,rating_list))
data_knn.head(2)
Out[94]:
netflixid title director cast rating cast_binary director_binary rating_binary
0 81145628 Norm of the North: King Sized Adventure Richard Finn, Tim Maltby Alan Marriott, Andrew Toth, Brian Dobson, Cole... TV-PG [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4 80125979 #realityhigh Fernando Lebrija Nesta Cooper, Kate Walsh, John Michael Higgins... TV-14 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ... [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...

spatial is python module is used to find the distance between two points. So when we imported spatial and used cosine distance modeule, it finds the distance between two points based on cosine. Cossine distance is based on cosine similarity. Cosine similarity is finding the distance based on cos-angle drawn by those two points. For example, if two points lie in same vector or nearer, the distance between them is nominal which increases the similarity or vice-versa. Thus this concept is applied to find the recommendation system as it finds the similarity between the features (casting,director and so on).

In [111]:
from scipy import spatial

#defining the fuunction that finds the similarity between two features
def Similarity(firstid, secondid):
    #getting specific rows
    first_set = data_knn.iloc[firstid] 
    second_set = data_knn.iloc[secondid]
    
    #getting the column data of cast binary of two sets
    first_cast = first_set['cast_binary']
    second_cast = second_set['cast_binary']
    
   # Compute the Cosine distance of 1-D array between first_cast and second_cast
    cast_distance = spatial.distance.cosine(first_cast, second_cast)
    
    
    #getting the column data of director binary of two sets
    first_director = first_set['director_binary']
    second_director = second_set['director_binary']
    
    # Compute the Cosine distance of 1-D array between first_director and second_director
    director_distance = spatial.distance.cosine(first_director, second_director)
    
    #getting the column data of rating binary of two sets
    first_rating = first_set['rating_binary']
    second_rating = second_set['rating_binary']
    
    # Compute the Cosine distance of 1-D array between first_rating and second_rating
    rating_distance = spatial.distance.cosine(first_rating, second_rating)
    
    return director_distance + cast_distance + rating_distance # returns total distance between the two data 
In [112]:
# for example distance bewteen first row and hundreth row is calculated
Similarity(0,99)

# to understand more.....the data for the particular rows are printed
print(data_knn.iloc[0])
print(data_knn.iloc[99])
netflixid                                                   81145628
title                        Norm of the North: King Sized Adventure
director                                    Richard Finn, Tim Maltby
cast               Alan Marriott, Andrew Toth, Brian Dobson, Cole...
rating                                                         TV-PG
cast_binary        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
director_binary    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
rating_binary      [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: 0, dtype: object
netflixid                                                   80243600
title                                   Between Two Ferns: The Movie
director                                              Scott Aukerman
cast               Zach Galifianakis, Lauren Lapkus, Ryan Gaul, J...
rating                                                         TV-MA
cast_binary        [0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, ...
director_binary    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
rating_binary      [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...
Name: 154, dtype: object
In [116]:
# getting specific columns
rating_view_to_merge = netflix_data_filter.loc[:,['netflixid','rating']]

#renaming column names
rating_view_to_merge.rename(columns={'rating':'rating_num'}, inplace =True)

#mergind two dataframes
merge_for_recommendation = pd.merge(data_knn, rating_view_to_merge, on='netflixid', how='inner')

#print(merge_for_recommendation.shape) 
merge_for_recommendation.head(2)
Out[116]:
netflixid title director cast rating cast_binary director_binary rating_binary rating_num
0 80045922 6 Years Hannah Fidell Taissa Farmiga, Ben Rosenfield, Lindsay Burdge... NR [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ... 5.6
1 80157072 Hold the Dark Jeremy Saulnier Jeffrey Wright, Alexander Skarsgård, James Bad... TV-MA [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ... 5.6
In [117]:
# specifying range into new variable
new_id = range(0,merge_for_recommendation.shape[0])

#making a new column with customized sequential ids
merge_for_recommendation['new_id']=new_id

#specifying 
merge_for_recommendation=merge_for_recommendation.drop(columns = ['netflixid'])

#'original_title','genres','vote_average','genres_bin','cast_bin','new_id','director','director_bin','words_bin']]


merge_for_recommendation.head(15)
Out[117]:
title director cast rating cast_binary director_binary rating_binary rating_num new_id
0 6 Years Hannah Fidell Taissa Farmiga, Ben Rosenfield, Lindsay Burdge... NR [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ... 5.6 0
1 Hold the Dark Jeremy Saulnier Jeffrey Wright, Alexander Skarsgård, James Bad... TV-MA [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ... 5.6 1
2 Joseph: King of Dreams Rob LaDuca, Robert C. Ramirez Ben Affleck, Mark Hamill, Richard Herd, Mauree... TV-PG [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 6.5 2
3 Swearnet: The Movie Warren P. Sonoda Mike Smith, John Paul Tremblay, Robb Wells, Pa... NR [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ... 5.9 3
4 Gaga: Five Foot Two Chris Moukarbel Lady Gaga TV-MA [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ... 7.0 4
5 Barbie Dolphin Magic Conrad Helten Erica Lindbeck, Shannon Chan-Kent, Kazumi Evan... TV-G [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 5.7 5
6 Keith Richards: Under the Influence Morgan Neville Keith Richards TV-PG [0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 7.2 6
7 Cedric the Entertainer: Live from the Ville Troy Miller Cedric the Entertainer TV-MA [0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ... 5.9 7
8 Cowspiracy: The Sustainability Secret Kip Andersen, Keegan Kuhn Kip Andersen NR [1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ... 8.3 8
9 Jeff Dunham: Relative Disaster Michael Simon Jeff Dunham TV-MA [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ... NaN 9
10 Garbage Qaushiq Mukherjee Tanmay Dhanania, Trimala Adhikari, Satarupa Da... TV-MA [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ... 3.3 10
11 Brother's Shadow Todd S. Yellin Scott Cohen, Judd Hirsch, Susan Floyd, Elliot ... TV-14 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... 6.1 11
12 Little Evil Eli Craig Adam Scott, Evangeline Lilly, Bridget Everett,... TV-MA [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ... 5.7 12
13 A Noble Intention Joram Lürsen Gijs Scholten van Aschat, Jacob Derwig, Rifka ... NR [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ... 6.7 13
14 Bon Bini Holland Jelle de Jonge Jandino Asporaat, Liliana de Vries, Teun Kuilb... TV-14 [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... 5.6 14
In [118]:
for ind, merge_for_recommend in merge_for_recommendation.iterrows():
    print(merge_for_recommend)
    
    break
title                                                        6 Years
director                                               Hannah Fidell
cast               Taissa Farmiga, Ben Rosenfield, Lindsay Burdge...
rating                                                            NR
cast_binary        [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
director_binary    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
rating_binary      [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...
rating_num                                                       5.6
new_id                                                             0
Name: 0, dtype: object
In [146]:
merge_for_recommendation = merge_for_recommendation.dropna()
merge_for_recommendation.shape
Out[146]:
(287, 9)
In [147]:
def getNeighbors(standard_movie_to_comp, K):
        distances = []
        
        #using for loop to acess the index no of rows and the data of rows with the columnnames
        for ind, merge_for_recommend in merge_for_recommendation.iterrows():
            
            #checking if the value of id of series is matched with the user entered value of the series
            if merge_for_recommend['new_id'] != standard_movie_to_comp['new_id'].values[0]:
                
                #calling similarity function to find the disance between the user selected title and the title located in dataframe
                dist = Similarity(standard_movie_to_comp['new_id'].values[0], merge_for_recommend['new_id'])
                
                #appending the value of distances 
                distances.append((merge_for_recommend['new_id'], dist))
        
        #sorting the distances, constructing iterable object and fetching the 1st element out of it.
        distances.sort(key=operator.itemgetter(1))
        neighbors = []  #empty list
    
        #using for loop to append the distances 
        for x in range(K):
            neighbors.append(distances[x])
        return neighbors
In [160]:
def predict_rating():
    # asking title from the user
    #using try and except block
    try:
        name_movie = input('Enter a movie title: ')
        print('')

        #looking for the users title in the dataframe. Once the title is found picking whole row and converting it to dataframe and transposing it

        new_movie = merge_for_recommendation[merge_for_recommendation['title'].str.contains(name_movie)].iloc[0].to_frame().T
        print('Selected Movie: ',new_movie.title.values[0])
        
    
        K = 5 #supposing value of k to be 5 and Rating to be 0
        Rating = 0
        neighbors = getNeighbors(new_movie, K)  #calling function that will get the user input and title and find the distance


        for i in neighbors:  #using for loop to find the rating by summing all the rating distances
            Rating = Rating+merge_for_recommendation.iloc[i[0]][7] 
        
        print('\n')
        Rating = Rating/K  #the predicted rating
        print('The predicted rating for %s is %f' %(new_movie['title'].values[0],Rating))
        print('The actual rating for %s is %f' %(new_movie['title'].values[0],new_movie['rating_num']))
    
        #print(f"The predicted rating for {new_movie['title'].values[0]} is {Rating}")
        #print(f"The actual rating for {new_movie['title'].values[0]} is {new_movie['rating_num']}")

    
    except Exception:
        print("\nYour movie can not be processed/found for rating prediction. Please enter another movies like \n"
                     "Hold the Dark\n"
                    "Little Evil")
        predict_rating()
    
In [ ]:
 
In [162]:
predict_rating()

#try the following:
    #Hold the Dark
    #Gaga: Five Foot Two
    # Time Trap
    #Supergirl
    #Little Evil
Enter a movie title: Time Trap


Your movie can not be processed/found for rating prediction. Please enter another movies like 
Hold the Dark
Little Evil
Enter a movie title: Little Evil

Selected Movie:  Little Evil


The predicted rating for Little Evil is 5.820000
The actual rating for Little Evil is 5.700000

KNN classifications for rating by director

In [163]:
#dropping duplicates
n_data_filter =n_data.drop_duplicates(subset=['netflixid','title'])
netflix_data_filter =netflix_dat.drop_duplicates(subset=['netflixid','title'])

print(netflix_data_filter.duplicated().any()) #again checking if any duplicates left
print(n_data_filter.duplicated().any()) # checking andy duplicates 
False
False
In [164]:
copy_count_rated.head(2)
Out[164]:
netflixid type title director cast country date_added release_year rating duration listed_in description added_year added_month rating_num unogsdate rating_standard
0 70234439 TV Show Transformers Prime NaN Peter Cullen, Sumalee Montano, Frank Welker, J... United States 2018-09-08 2013 TV-Y7-FV 1 Season Kids' TV With the help of three human allies, the Autob... 2018.0 9.0 7.8 NaT High-rated
1 80045922 Movie 6 Years Hannah Fidell Taissa Farmiga, Ben Rosenfield, Lindsay Burdge... United States 2015-09-08 2015 NR 80 min Dramas, Independent Movies, Romantic Movies As a volatile young couple who have been toget... 2015.0 9.0 5.6 NaT Low-rated
In [165]:
#getting specific columns
rating_merge_classify = copy_count_rated.loc[:,['netflixid','rating_standard','rating_num']]
#merging two dataframes
merge_for_classification = pd.merge(data_knn, rating_merge_classify, on='netflixid', how='inner')

merge_for_classification.head(2)
(303, 10)
Out[165]:
netflixid title director cast rating cast_binary director_binary rating_binary rating_standard rating_num
0 80045922 6 Years Hannah Fidell Taissa Farmiga, Ben Rosenfield, Lindsay Burdge... NR [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ... Low-rated 5.6
1 80157072 Hold the Dark Jeremy Saulnier Jeffrey Wright, Alexander Skarsgård, James Bad... TV-MA [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ... Low-rated 5.6
In [166]:
#merge_for_classification.director.to_list()
In [167]:
#LabelEncoder is a utility class to help normalize labels such that they contain only values between 0 and n_classes-1 
from sklearn import preprocessing

#creating labelEncoder, this will encode the y(target) variables
le = preprocessing.LabelEncoder()

# Converting string labels into numbers.
rating_encoded=le.fit_transform(merge_for_classification['rating'])
merge_for_classification['rating_encoded'] = rating_encoded
casting_encoded = le.fit_transform(merge_for_classification['cast'])
merge_for_classification['casting_encoded'] = casting_encoded
director_encoded = le.fit_transform(merge_for_classification['director'])
merge_for_classification['director_encoded'] = director_encoded
rating_standard_encoded = le.fit_transform(merge_for_classification['rating_standard'])
merge_for_classification['rating_standard_encoded'] = rating_standard_encoded
In [168]:
merge_for_classification
#.rating_standard_encoded.to_list()
Out[168]:
netflixid title director cast rating cast_binary director_binary rating_binary rating_standard rating_num rating_encoded casting_encoded director_encoded rating_standard_encoded
0 80045922 6 Years Hannah Fidell Taissa Farmiga, Ben Rosenfield, Lindsay Burdge... NR [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ... Low-rated 5.6 0 257 86 1
1 80157072 Hold the Dark Jeremy Saulnier Jeffrey Wright, Alexander Skarsgård, James Bad... TV-MA [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ... Low-rated 5.6 6 113 114 1
2 60003155 Joseph: King of Dreams Rob LaDuca, Robert C. Ramirez Ben Affleck, Mark Hamill, Richard Herd, Mauree... TV-PG [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... Low-rated 6.5 7 20 208 1
3 70304191 Swearnet: The Movie Warren P. Sonoda Mike Smith, John Paul Tremblay, Robb Wells, Pa... NR [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ... Low-rated 5.9 0 191 249 1
4 80196586 Gaga: Five Foot Two Chris Moukarbel Lady Gaga TV-MA [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ... High-rated 7.0 6 155 37 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
298 70245163 Call the Midwife Philippa Lowthorpe Vanessa Redgrave, Bryony Hannah, Helen George,... TV-14 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... High-rated 8.4 4 272 197 0
299 80062047 Degrassi: Next Class Stefan Brogren Amanda Arcuri, Amir Bageria, Soma Bhatia, Jami... TV-14 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... High-rated 7.0 4 8 230 0
300 70180293 The Cat in the Hat Knows a Lot About That! Tony Collingwood Martin Short, Alexa Torrington, Jacob Ewaniuk,... TV-Y [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ... Low-rated 6.6 8 180 244 1
301 70204981 Fullmetal Alchemist: Brotherhood Yasuhiro Irie Romi Park, Rie Kugimiya, Megumi Takamoto, Shin... TV-14 [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... High-rated 9.1 4 224 257 0
302 70142436 Merlin James Hawes Colin Morgan, Bradley James, Katie McGrath, An... TV-PG [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... High-rated 7.9 7 48 100 0

303 rows × 14 columns

In [175]:
#defining x values and y values
X = merge_for_classification[['casting_encoded','director_encoded']].values
y= merge_for_classification['rating_standard_encoded'].to_numpy()
In [170]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

def classify_rating(c,d):
    
    X = merge_for_classification[['casting_encoded','director_encoded']].values
    y= merge_for_classification['rating_standard_encoded'].values

    #print(X)
    #print(y)

    # Split dataset into training set and test set
    X_train_first, X_test_first, y_train_first, y_test_first = train_test_split(X, y, train_size = 0.8,test_size=0.2, random_state=4) # 70% training and 30% test


    from sklearn.neighbors import KNeighborsClassifier
    # Create the knn model.
    # Look at the five closest neighbors.
    knn = KNeighborsClassifier(n_neighbors=5)
    # Fit the model on the training data.
    knn.fit(X_train_first, y_train_first)
    # Make point predictions on the test set using the fit model.
    #predictions = knn.predict(X_test_first)

    predicted= knn.predict([[c,d]]) 

    
    #print(predicted)
    #print(metrics.accuracy_score(y_test_first, predictions))
    
    print(" ")
    
    director_name = merge_for_classification[merge_for_classification['director_encoded'] == d].iloc[0][2]
    movie_title = merge_for_classification[merge_for_classification['casting_encoded'] == c].iloc[0][1]
    
    #print(f"The rating standard is: {predicted}")
    if predicted == [1]:
        print(f"If '{director_name}' directs '{movie_title}', the rating standard will be Low-rated.")
    else:
        print(f"If '{director_name}' directs '{movie_title}', the rating standard will be High-rated.")
    
    
    
In [171]:
def get_cast_director_info():
    title = input("Enter a title name: ")
    print(" ")
    
    #getting title according to user
    new_title = merge_for_classification[merge_for_classification['title'].str.contains(title)].iloc[0].to_frame().T
    
    #getting encoded values from the user selection
    cast_encoded = new_title.casting_encoded.values[0]
    print(f"The cast name for this movie are: \n------{new_title.cast.values[0]}")
    print(" ")
    print(f"This movie/show is {new_title.rating_standard.values[0]}.")
    
    #asking for the director
    director =input("Enter a director name that you would like to see this movie directed: ")
    #new_movie = merge_for_classification[merge_for_classification['cast'].str.contains(name)].iloc[0].to_frame().T
    
    #getting movie data from director
    n_movie = merge_for_classification[merge_for_classification['director'].str.contains(director)].iloc[0].to_frame().T
    
    #getting director encoded
    direct_encoded = n_movie.director_encoded.values[0]
    
    return cast_encoded, direct_encoded

# Taissa Farmiga, Ben Rosenfield, Lindsay Burdge, Joshua Leonard, Jennifer Lafleur, 
#Peter Vack, Dana Wheeler-Nicholson, Jason Newman, Molly McMichael'
# Jay Karas
In [172]:
##### note: use title name 6 Years. It is Low-rated and the director named Jay Karas will make it high-rated.
In [174]:
#calling a function
cast,director = get_cast_director_info()
classify_rating(cast, director)
Enter a title name: Little Evil
 
The cast name for this movie are: 
------Adam Scott, Evangeline Lilly, Bridget Everett, Owen Atlas, Chris D'Elia, Donald Faison, Clancy Brown, Tyler Labine, Kyle Bornheimer, Carla Gallo, Brad Williams, Sally Field
 
This movie/show is Low-rated.
Enter a director name that you would like to see this movie directed: Jay Karas
 
If 'Jay Karas' directs 'Little Evil', the rating standard will be Low-rated.
In [ ]: